import os
import re
import time
from urllib.parse import urljoin

import pyperclip
from requests_html import HTMLSession

'''
https://www.aikanshuba.net
http://www.dbqu365.com/47/47206/
'''

# Download the chapters linked from a novel's table-of-contents page and
# append them, in listing order, to a single UTF-8 text file named after the
# book title. The table-of-contents URL is read from the clipboard.

# Create the HTTP session used for every request.
session = HTMLSession()
# Copied URLs often carry a trailing newline or spaces; strip them so the
# request and the chapter-link resolution below work on a clean URL.
url = pyperclip.paste().strip()
# Send a GET request for the table-of-contents page.
page = session.get(url)
page.encoding = 'utf-8'

# Fail with a readable message instead of an AttributeError on None when the
# page does not have the expected structure.
div = page.html.find('div.ml_list', first=True)
if div is None:
    raise SystemExit(f'No chapter list (div.ml_list) found at {url!r}')
content = div.find('ul', first=True)
if content is None:
    raise SystemExit(f'No <ul> inside div.ml_list at {url!r}')
title = page.html.find('h1', first=True)
if title is None:
    raise SystemExit(f'No <h1> book title found at {url!r}')
name = title.text

# Download directory.
path = r'D:\.爬取为txt'
os.makedirs(path, exist_ok=True)
# Characters that Windows forbids in file names are replaced so that a title
# containing e.g. '?' or ':' does not make open() fail.
safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
# os.path.join supplies the separator; the previous string concatenation
# produced a file name with a stray leading space.
out_file = os.path.join(path, f'{safe_name}.txt')

# Matches a run of digits at the very start of a chapter title.
leading_digits = re.compile(r'^\d+')

a_s = content.find('a')
# NOTE: if a site lists chapters newest-first, reverse the list here
# (a_s = a_s[::-1]) before numbering.
for i, a in enumerate(a_s):
    # The first three links are not downloaded, but they still count toward
    # the chapter numbering below.
    # NOTE(review): presumably these are non-chapter or duplicate links at
    # the top of the list - confirm against the target site.
    if i <= 2:
        continue

    # urljoin gives the same result as plain concatenation for a relative
    # href under a URL ending in '/', and also handles absolute and
    # root-relative hrefs correctly.
    chapter_url = urljoin(url, a.attrs['href'])
    chap = a.text.replace('.', ' ')
    # Replace a leading number with a "第N章 " heading based on list position.
    cp = leading_digits.sub('第' + str(i + 1) + '章 ', chap)

    print('href ', i, chapter_url, cp)
    _page = session.get(chapter_url)
    _page.encoding = 'utf-8'
    body = _page.html.find('p#articlecontent', first=True)
    if body is None:
        raise SystemExit(
            f'No chapter text (p#articlecontent) found at {chapter_url!r}'
        )
    text = body.text

    # Append mode: every chapter is added to the end of the same file.
    with open(out_file, 'a', encoding='utf-8') as f:
        f.write('\n')
        f.write(cp)
        f.write('\n\n')
        f.write(text)
    # Pause between requests to avoid hammering the server.
    time.sleep(0.5)